library(ngsReports)
library(magrittr)
library(scales)
library(pander)
library(tidyverse)
theme_set(theme_bw())
options(scipen = 999)
if (interactive()) setwd(here::here("R"))
deMuxFqc <- list.files("../3_demuxTrimmed/FastQC/", pattern = "zip", full.names = TRUE) %>%
FastqcDataList()
alnFqc <- list.files("../4_aligned/FastQC/", pattern = "zip", full.names = TRUE) %>%
FastqcDataList()
oryGC <- read_rds("oryGC.RDS")
Alignments were filtered for:
SA tag added by bwa.MAPQ score, this equates to an approximate \(p = 0.001\) for an incorrect alignment.list(
readTotals(deMuxFqc) %>%
mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
group_by(Sample) %>%
summarise(Total_Sequences = sum(Total_Sequences)) %>%
mutate(Type = "Pre-Alignment"),
readTotals(alnFqc) %>%
mutate(Sample = str_remove(Filename, ".bam"),
Type = "Post-Alignment") %>%
dplyr::select(Sample, Type, Total_Sequences)
) %>%
bind_rows() %>%
mutate(Population = case_when(
grepl("gc", Sample) ~ "1996",
grepl("ora", Sample) ~ "2012",
!grepl("(gc|ora)", Sample) ~ "2010"
)) %>%
# filter(Population != "2010") %>%
ggplot(aes(Sample, Total_Sequences, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
facet_wrap(~Population, scales = "free") +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Comparison of library sizes before and after alignment. Given the aggressive filtering of alignments, this shows an acceptable rate of alignment across all samples
list(
readTotals(deMuxFqc) %>%
mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
group_by(Sample) %>%
summarise(Total_Sequences = sum(Total_Sequences)) %>%
mutate(Type = "Pre-Alignment"),
readTotals(alnFqc) %>%
mutate(
Sample = str_remove(Filename, ".bam"),
Type = "Post-Alignment"
) %>%
dplyr::select(Sample, Type, Total_Sequences)
) %>%
bind_rows() %>%
mutate(
Population = case_when(
grepl("gc", Sample) ~ "1996",
grepl("ora", Sample) ~ "2012",
!grepl("(gc|ora)", Sample) ~ "2010"
)
) %>%
# filter(Population != "2010") %>%
spread(key = Type, value = Total_Sequences) %>%
mutate(Rate = `Post-Alignment`/`Pre-Alignment`) %>%
ggplot(aes(Sample, Rate, fill = Population)) +
geom_bar(stat = "identity") +
coord_flip() +
facet_wrap(~Population, scales = "free_y") +
scale_y_continuous(label = percent, expand = expand_scale(c(0, 0.05))) +
labs(y = "Alignment Rate")
Alignment rate for each sample after filtering for uniquely mapped reads with high mapping qualities. The two samples gc2700 and gc2709 appear to be outliers.
readTotals(alnFqc) %>%
mutate(
Sample = str_remove(Filename, ".bam"),
Population = case_when(
grepl("gc", Sample) ~ "1996",
grepl("ora", Sample) ~ "2012",
!grepl("(gc|ora)", Sample) ~ "2010"
)
) %>%
dplyr::select(Sample, Population, Total_Sequences) %>%
group_by(Population) %>%
summarise(
Samples = n(),
`Smallest Library` = min(Total_Sequences),
`Median Library` = median(Total_Sequences),
`Largest Library` = max(Total_Sequences),
`Total Alignments` = sum(Total_Sequences)
) %>%
pander(
big.mark = ",",
split.tables = Inf,
style = "rmarkdown",
justify = "rrrrrr",
caption = "*Summary of Library Sizes After Alignment*"
)
| Population | Samples | Smallest Library | Median Library | Largest Library | Total Alignments |
|---|---|---|---|---|---|
| 1996 | 59 | 1,541,386 | 5,253,757 | 16,146,550 | 351,966,386 |
| 2010 | 37 | 1,264,888 | 4,141,755 | 7,670,908 | 163,491,029 |
| 2012 | 53 | 3,411,367 | 6,740,247 | 14,398,379 | 377,550,470 |
plotGcContent(alnFqc, usePlotly = TRUE, dendrogram = TRUE, theoreticalGC = TRUE, GCobject = oryGC, species = "Ocuniculus")
Difference in GC content of alignments when comparing each sample to Theoretical GC content from sampling the O. cuniculus genome.
alnFqc %>%
plotGcContent(plotType = "line",usePlotly = TRUE, GCobject = oryGC, species = "Ocuniculus")
GC content of the complete set of samples. The three outlier samples are clearly evident.
lowQ <- paste(c("pt1125", "gc2709", "gc2700"), "bam", sep = ".")
Potential low quality samples were identified by GC content as pt1125.bam, gc2709.bam and gc2700.bam. Alignments from these samples should be moved and placed into a separate folder to ensure their exclusion from the stacks pipeline. The sample gc2776 was also of some concern, but was retained for downstream analysis.
Inspection of the per base sequence content confirmed the divergent patterns of these samples.
o <- order(fqName(alnFqc))
plotSeqContent(alnFqc[o])
Per base sequence content of all samples, with divergent patterns clearly identifying the outlier samples
All identified low quality samples were manually moved to a sub-folder indicating their low quality and excluded from the stacks pipeline.
alnFqcPass <- alnFqc[!fqName(alnFqc) %in% lowQ]
list(
readTotals(deMuxFqc) %>%
mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
group_by(Sample) %>%
summarise(Total_Sequences = sum(Total_Sequences)) %>%
mutate(Type = "Pre-Alignment"),
readTotals(alnFqcPass) %>%
mutate(
Sample = str_remove(Filename, ".bam"),
Type = "Post-Alignment") %>%
dplyr::select(Sample, Type, Total_Sequences)
) %>%
bind_rows() %>%
mutate(
Population = case_when(
grepl("gc", Sample) ~ "1996",
grepl("ora", Sample) ~ "2012",
!grepl("(gc|ora)", Sample) ~ "2010"
)
) %>%
# filter(Population != "2010") %>%
spread(key = Type, value = Total_Sequences) %>%
dplyr::filter(`Post-Alignment` > 0) %>%
mutate(
`Alignment Rate` = `Post-Alignment` / `Pre-Alignment`,
) %>%
group_by(Population) %>%
summarise(
`Pre-Alignment` = sum(`Pre-Alignment`),
`Post-Alignment` = sum(`Post-Alignment`),
minRate = min(`Alignment Rate`),
maxRate = max(`Alignment Rate`)
) %>%
bind_rows(
tibble(
Population = "Total",
`Post-Alignment` = sum(.$`Post-Alignment`),
`Pre-Alignment` = sum(.$`Pre-Alignment`)
)
) %>%
mutate(
`Alignment Rate` = `Post-Alignment` / `Pre-Alignment`,
) %>%
mutate_at(
vars(contains("Rate")), ~percent(., accuracy = 0.01)
) %>%
dplyr::select(
Population, `Pre-Alignment`, contains("Align"), everything()
) %>%
dplyr::rename(Worst = minRate, Best = maxRate) %>%
pander(
big.mark = ",",
style = "rmarkdown",
justify = "lrrrrr",
missing = "",
emphasize.strong.rows = nrow(.),
split.table = Inf,
caption = "*Summary of alignment rates overall and by population, after exclusion of low quality samples.*"
)
| Population | Pre-Alignment | Post-Alignment | Alignment Rate | Worst | Best |
|---|---|---|---|---|---|
| 1996 | 420,870,972 | 345,681,486 | 82.13% | 65.73% | 85.97% |
| 2010 | 191,105,136 | 161,676,155 | 84.60% | 80.43% | 86.21% |
| 2012 | 455,559,218 | 377,550,470 | 82.88% | 74.09% | 86.30% |
| Total | 1,067,535,326 | 884,908,111 | 82.89% |
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tidyverse(v.1.3.0), pander(v.0.6.3), scales(v.1.1.0), magrittr(v.1.5), ngsReports(v.1.1.2), tibble(v.2.1.3), ggplot2(v.3.2.1) and BiocGenerics(v.0.32.0)
loaded via a namespace (and not attached): colorspace(v.1.4-1), hwriter(v.1.3.2), ellipsis(v.0.3.0), XVector(v.0.26.0), GenomicRanges(v.1.38.0), ggdendro(v.0.1-20), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.3), ggrepel(v.0.8.1), fansi(v.0.4.1), lubridate(v.1.7.4), xml2(v.1.2.2), leaps(v.3.1), knitr(v.1.27), zeallot(v.0.1.0), jsonlite(v.1.6), Rsamtools(v.2.2.1), Cairo(v.1.5-10), broom(v.0.5.3), cluster(v.2.1.0), dbplyr(v.1.4.2), png(v.0.1-7), shiny(v.1.4.0), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), fastmap(v.1.0.1), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.2.0.1), later(v.1.0.0), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), reshape2(v.1.4.3), FactoMineR(v.2.1), ShortRead(v.1.44.1), Rcpp(v.1.0.3), Biobase(v.2.46.0), cellranger(v.1.1.0), vctrs(v.0.2.1), Biostrings(v.2.54.0), nlme(v.3.1-143), crosstalk(v.1.0.0), xfun(v.0.12), rvest(v.0.3.5), mime(v.0.8), lifecycle(v.0.1.0), zlibbioc(v.1.32.0), MASS(v.7.3-51.5), zoo(v.1.8-7), promises(v.1.1.0), hms(v.0.5.3), SummarizedExperiment(v.1.16.1), RColorBrewer(v.1.1-2), yaml(v.2.2.0), latticeExtra(v.0.6-29), stringi(v.1.4.5), highr(v.0.8), S4Vectors(v.0.24.3), BiocParallel(v.1.20.1), truncnorm(v.1.0-8), GenomeInfoDb(v.1.22.0), rlang(v.0.4.2), pkgconfig(v.2.0.3), matrixStats(v.0.55.0), bitops(v.1.0-6), evaluate(v.0.14), lattice(v.0.20-38), GenomicAlignments(v.1.22.1), htmlwidgets(v.1.5.1), labeling(v.0.3), tidyselect(v.0.2.5), plyr(v.1.8.5), R6(v.2.4.1), IRanges(v.2.20.2), generics(v.0.0.2), DelayedArray(v.0.12.2), DBI(v.1.1.0), pillar(v.1.4.3), haven(v.2.2.0), withr(v.2.1.2), scatterplot3d(v.0.3-41), RCurl(v.1.98-1.1), modelr(v.0.1.5), crayon(v.1.3.4), plotly(v.4.9.1), rmarkdown(v.2.1), jpeg(v.0.1-8.1), grid(v.3.6.2), readxl(v.1.3.1), data.table(v.1.12.8), reprex(v.0.3.0), digest(v.0.6.23), flashClust(v.1.01-2), webshot(v.0.5.2), xtable(v.1.8-4), httpuv(v.1.5.2), stats4(v.3.6.2), munsell(v.0.5.0), viridisLite(v.0.3.0) and kableExtra(v.1.1.0)